#Import required libraries
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns ; sns.set()
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
import sklearn.metrics as metrics
from sklearn.metrics import classification_report, confusion_matrix
from statistics import mean
from sklearn.linear_model import LinearRegression, Ridge, Lasso,SGDClassifier
from sklearn.pipeline import Pipeline
# Load the dataset from the CSV file into a DataFrame
data = pd.read_csv("Covid Dataset.csv")
# Report how many rows were read, then preview the first five
print(data.shape[0])
data.head()
5434
| Breathing Problem | Fever | Dry Cough | Sore throat | Running Nose | Asthma | Chronic Lung Disease | Headache | Heart Disease | Diabetes | ... | Fatigue | Gastrointestinal | Abroad travel | Contact with COVID Patient | Attended Large Gathering | Visited Public Exposed Places | Family working in Public Exposed Places | Wearing Masks | Sanitization from Market | COVID-19 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Yes | Yes | Yes | Yes | Yes | No | No | No | No | Yes | ... | Yes | Yes | No | Yes | No | Yes | Yes | No | No | Yes |
| 1 | Yes | Yes | Yes | Yes | No | Yes | Yes | Yes | No | No | ... | Yes | No | No | No | Yes | Yes | No | No | No | Yes |
| 2 | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | No | Yes | ... | Yes | Yes | Yes | No | No | No | No | No | No | Yes |
| 3 | Yes | Yes | Yes | No | No | Yes | No | No | Yes | Yes | ... | No | No | Yes | No | Yes | Yes | No | No | No | Yes |
| 4 | Yes | Yes | Yes | Yes | Yes | No | Yes | Yes | Yes | Yes | ... | No | Yes | No | Yes | No | Yes | No | No | No | Yes |
5 rows × 21 columns
type(data)
pandas.core.frame.DataFrame
data.describe(include='all')
| Breathing Problem | Fever | Dry Cough | Sore throat | Running Nose | Asthma | Chronic Lung Disease | Headache | Heart Disease | Diabetes | ... | Fatigue | Gastrointestinal | Abroad travel | Contact with COVID Patient | Attended Large Gathering | Visited Public Exposed Places | Family working in Public Exposed Places | Wearing Masks | Sanitization from Market | COVID-19 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 5434 | 5434 | 5434 | 5434 | 5434 | 5434 | 5434 | 5434 | 5434 | 5434 | ... | 5434 | 5434 | 5434 | 5434 | 5434 | 5434 | 5434 | 5434 | 5434 | 5434 |
| unique | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | ... | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 1 | 1 | 2 |
| top | Yes | Yes | Yes | Yes | Yes | No | No | Yes | No | No | ... | Yes | No | No | Yes | No | Yes | No | No | No | Yes |
| freq | 3620 | 4273 | 4307 | 3953 | 2952 | 2920 | 2869 | 2736 | 2911 | 2846 | ... | 2821 | 2883 | 2983 | 2726 | 2924 | 2820 | 3172 | 5434 | 5434 | 4383 |
4 rows × 21 columns
data.nunique()
Breathing Problem 2 Fever 2 Dry Cough 2 Sore throat 2 Running Nose 2 Asthma 2 Chronic Lung Disease 2 Headache 2 Heart Disease 2 Diabetes 2 Hyper Tension 2 Fatigue 2 Gastrointestinal 2 Abroad travel 2 Contact with COVID Patient 2 Attended Large Gathering 2 Visited Public Exposed Places 2 Family working in Public Exposed Places 2 Wearing Masks 1 Sanitization from Market 1 COVID-19 2 dtype: int64
data.isna().sum()
Breathing Problem 0 Fever 0 Dry Cough 0 Sore throat 0 Running Nose 0 Asthma 0 Chronic Lung Disease 0 Headache 0 Heart Disease 0 Diabetes 0 Hyper Tension 0 Fatigue 0 Gastrointestinal 0 Abroad travel 0 Contact with COVID Patient 0 Attended Large Gathering 0 Visited Public Exposed Places 0 Family working in Public Exposed Places 0 Wearing Masks 0 Sanitization from Market 0 COVID-19 0 dtype: int64
# Print the frequency table of every column, each followed by a blank line
for _, column_values in data.items():
    print(column_values.value_counts(), "\n")
Yes 3620 No 1814 Name: Breathing Problem, dtype: int64 Yes 4273 No 1161 Name: Fever, dtype: int64 Yes 4307 No 1127 Name: Dry Cough, dtype: int64 Yes 3953 No 1481 Name: Sore throat, dtype: int64 Yes 2952 No 2482 Name: Running Nose, dtype: int64 No 2920 Yes 2514 Name: Asthma, dtype: int64 No 2869 Yes 2565 Name: Chronic Lung Disease, dtype: int64 Yes 2736 No 2698 Name: Headache, dtype: int64 No 2911 Yes 2523 Name: Heart Disease, dtype: int64 No 2846 Yes 2588 Name: Diabetes, dtype: int64 No 2771 Yes 2663 Name: Hyper Tension, dtype: int64 Yes 2821 No 2613 Name: Fatigue , dtype: int64 No 2883 Yes 2551 Name: Gastrointestinal , dtype: int64 No 2983 Yes 2451 Name: Abroad travel, dtype: int64 Yes 2726 No 2708 Name: Contact with COVID Patient, dtype: int64 No 2924 Yes 2510 Name: Attended Large Gathering, dtype: int64 Yes 2820 No 2614 Name: Visited Public Exposed Places, dtype: int64 No 3172 Yes 2262 Name: Family working in Public Exposed Places, dtype: int64 No 5434 Name: Wearing Masks, dtype: int64 No 5434 Name: Sanitization from Market, dtype: int64 Yes 4383 No 1051 Name: COVID-19, dtype: int64
# Class balance of the target column as a histogram
px.histogram(data, x="COVID-19", color='COVID-19')
# Same class balance as a donut chart labelled with percentages;
# the counts are computed once and reused for labels and values
covid_counts = data["COVID-19"].value_counts()
donut = go.Pie(
    labels=covid_counts.index,
    values=covid_counts.values,
    hole=.3,
    pull=[0.1, 0.1],
    textinfo='label+percent',
)
fig = go.Figure(data=[donut])
fig.update_layout(title="COVID_19 percentage of cases")
fig.show()
# Distribution of four symptom columns, coloured by COVID-19 outcome
px.histogram(data, x='Breathing Problem', color='COVID-19')
px.histogram(data, x='Fever', color='COVID-19')
px.histogram(data, x='Dry Cough', color='COVID-19')
px.histogram(data, x='Sore throat', color='COVID-19')
data.columns
Index(['Breathing Problem', 'Fever', 'Dry Cough', 'Sore throat',
'Running Nose', 'Asthma', 'Chronic Lung Disease', 'Headache',
'Heart Disease', 'Diabetes', 'Hyper Tension', 'Fatigue ',
'Gastrointestinal ', 'Abroad travel', 'Contact with COVID Patient',
'Attended Large Gathering', 'Visited Public Exposed Places',
'Family working in Public Exposed Places', 'Wearing Masks',
'Sanitization from Market', 'COVID-19'],
dtype='object')
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5434 entries, 0 to 5433 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Breathing Problem 5434 non-null object 1 Fever 5434 non-null object 2 Dry Cough 5434 non-null object 3 Sore throat 5434 non-null object 4 Running Nose 5434 non-null object 5 Asthma 5434 non-null object 6 Chronic Lung Disease 5434 non-null object 7 Headache 5434 non-null object 8 Heart Disease 5434 non-null object 9 Diabetes 5434 non-null object 10 Hyper Tension 5434 non-null object 11 Fatigue 5434 non-null object 12 Gastrointestinal 5434 non-null object 13 Abroad travel 5434 non-null object 14 Contact with COVID Patient 5434 non-null object 15 Attended Large Gathering 5434 non-null object 16 Visited Public Exposed Places 5434 non-null object 17 Family working in Public Exposed Places 5434 non-null object 18 Wearing Masks 5434 non-null object 19 Sanitization from Market 5434 non-null object 20 COVID-19 5434 non-null object dtypes: object(21) memory usage: 891.6+ KB
from sklearn.preprocessing import LabelEncoder

# Every column of the frame is a string categorical (dtype object), so all
# of them are converted to integer codes. LabelEncoder assigns codes in
# sorted label order, i.e. 'No' -> 0 and 'Yes' -> 1; columns that only
# contain 'No' become all zeros.
# Looping over the columns replaces the per-column copy-paste, which had
# encoded 'Dry Cough' and 'Sore throat' twice.
le = LabelEncoder()
for column in data.columns:
    data[column] = le.fit_transform(data[column])
data.head()
| Breathing Problem | Fever | Dry Cough | Sore throat | Running Nose | Asthma | Chronic Lung Disease | Headache | Heart Disease | Diabetes | ... | Fatigue | Gastrointestinal | Abroad travel | Contact with COVID Patient | Attended Large Gathering | Visited Public Exposed Places | Family working in Public Exposed Places | Wearing Masks | Sanitization from Market | COVID-19 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 1 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 1 |
| 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 |
| 2 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | ... | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 3 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 |
| 4 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | ... | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 |
5 rows × 21 columns
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5434 entries, 0 to 5433 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Breathing Problem 5434 non-null int32 1 Fever 5434 non-null int32 2 Dry Cough 5434 non-null int64 3 Sore throat 5434 non-null int64 4 Running Nose 5434 non-null int32 5 Asthma 5434 non-null int32 6 Chronic Lung Disease 5434 non-null int32 7 Headache 5434 non-null int32 8 Heart Disease 5434 non-null int32 9 Diabetes 5434 non-null int32 10 Hyper Tension 5434 non-null int32 11 Fatigue 5434 non-null int32 12 Gastrointestinal 5434 non-null int32 13 Abroad travel 5434 non-null int32 14 Contact with COVID Patient 5434 non-null int32 15 Attended Large Gathering 5434 non-null int32 16 Visited Public Exposed Places 5434 non-null int32 17 Family working in Public Exposed Places 5434 non-null int32 18 Wearing Masks 5434 non-null int32 19 Sanitization from Market 5434 non-null int32 20 COVID-19 5434 non-null int32 dtypes: int32(19), int64(2) memory usage: 488.3 KB
data.describe(include="all")
| Breathing Problem | Fever | Dry Cough | Sore throat | Running Nose | Asthma | Chronic Lung Disease | Headache | Heart Disease | Diabetes | ... | Fatigue | Gastrointestinal | Abroad travel | Contact with COVID Patient | Attended Large Gathering | Visited Public Exposed Places | Family working in Public Exposed Places | Wearing Masks | Sanitization from Market | COVID-19 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | ... | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | 5434.000000 | 5434.0 | 5434.0 | 5434.000000 |
| mean | 0.666176 | 0.786345 | 0.792602 | 0.727457 | 0.543246 | 0.462643 | 0.472028 | 0.503497 | 0.464299 | 0.476261 | ... | 0.519139 | 0.469452 | 0.451049 | 0.501656 | 0.461907 | 0.518955 | 0.416268 | 0.0 | 0.0 | 0.806588 |
| std | 0.471621 | 0.409924 | 0.405480 | 0.445309 | 0.498172 | 0.498648 | 0.499263 | 0.500034 | 0.498770 | 0.499482 | ... | 0.499680 | 0.499112 | 0.497644 | 0.500043 | 0.498593 | 0.499687 | 0.492984 | 0.0 | 0.0 | 0.395009 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 |
| 25% | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 1.000000 |
| 50% | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | ... | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.0 | 0.0 | 1.000000 |
| 75% | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.0 | 0.0 | 1.000000 |
| max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.0 | 0.0 | 1.000000 |
8 rows × 21 columns
# Both columns hold a single value for every row (std == 0), so they carry
# no information and are removed in one call
data = data.drop(columns=['Sanitization from Market', 'Wearing Masks'])
# Per-column histograms (trailing semicolon suppresses the notebook repr)
data.hist(bins=3, figsize=(20, 20));
# Pairwise Pearson correlations between the remaining columns
data.corr()
| Breathing Problem | Fever | Dry Cough | Sore throat | Running Nose | Asthma | Chronic Lung Disease | Headache | Heart Disease | Diabetes | Hyper Tension | Fatigue | Gastrointestinal | Abroad travel | Contact with COVID Patient | Attended Large Gathering | Visited Public Exposed Places | Family working in Public Exposed Places | COVID-19 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Breathing Problem | 1.000000 | 0.089903 | 0.159562 | 0.303768 | 0.055190 | 0.075318 | -0.098291 | -0.062172 | -0.073366 | 0.055427 | 0.045256 | 0.000561 | -0.075390 | 0.117795 | 0.214634 | 0.200304 | 0.066688 | 0.018295 | 0.443764 |
| Fever | 0.089903 | 1.000000 | 0.127580 | 0.322235 | 0.081758 | 0.073953 | -0.025160 | -0.035416 | -0.031462 | 0.050286 | 0.079001 | -0.060458 | -0.008067 | 0.128726 | 0.164704 | 0.070490 | 0.002252 | 0.012102 | 0.352891 |
| Dry Cough | 0.159562 | 0.127580 | 1.000000 | 0.213907 | -0.030763 | 0.086843 | -0.043664 | -0.035912 | 0.047566 | -0.006593 | 0.081989 | -0.039909 | 0.008251 | 0.331418 | 0.128330 | 0.117963 | 0.086176 | 0.163102 | 0.464292 |
| Sore throat | 0.303768 | 0.322235 | 0.213907 | 1.000000 | 0.039450 | 0.081377 | -0.050440 | -0.015971 | 0.002177 | 0.001938 | 0.042811 | -0.023290 | 0.025886 | 0.205986 | 0.189251 | 0.216438 | 0.079055 | 0.104378 | 0.502848 |
| Running Nose | 0.055190 | 0.081758 | -0.030763 | 0.039450 | 1.000000 | -0.022763 | -0.014376 | 0.068479 | -0.056750 | 0.042961 | -0.020445 | 0.007026 | -0.014673 | 0.034526 | 0.003776 | 0.061099 | 0.032568 | -0.061323 | -0.005657 |
| Asthma | 0.075318 | 0.073953 | 0.086843 | 0.081377 | -0.022763 | 1.000000 | -0.033771 | 0.037064 | 0.076783 | -0.012060 | 0.017707 | 0.006564 | 0.101909 | 0.068286 | 0.005046 | -0.044592 | 0.020941 | -0.115679 | 0.089930 |
| Chronic Lung Disease | -0.098291 | -0.025160 | -0.043664 | -0.050440 | -0.014376 | -0.033771 | 1.000000 | -0.050480 | -0.039860 | 0.046789 | -0.010331 | -0.047655 | -0.050333 | -0.088854 | -0.062482 | -0.020548 | -0.093049 | 0.038343 | -0.056837 |
| Headache | -0.062172 | -0.035416 | -0.035912 | -0.015971 | 0.068479 | 0.037064 | -0.050480 | 1.000000 | 0.048471 | 0.032390 | -0.207489 | 0.052035 | 0.097778 | 0.043589 | -0.082101 | -0.162992 | -0.005790 | -0.012625 | -0.027793 |
| Heart Disease | -0.073366 | -0.031462 | 0.047566 | 0.002177 | -0.056750 | 0.076783 | -0.039860 | 0.048471 | 1.000000 | -0.032956 | 0.049139 | -0.058925 | 0.004121 | -0.020761 | -0.025593 | -0.045437 | 0.086169 | 0.035000 | 0.027072 |
| Diabetes | 0.055427 | 0.050286 | -0.006593 | 0.001938 | 0.042961 | -0.012060 | 0.046789 | 0.032390 | -0.032956 | 1.000000 | 0.042543 | -0.043903 | 0.040651 | 0.039013 | -0.085696 | -0.061650 | -0.078212 | 0.097696 | 0.040627 |
| Hyper Tension | 0.045256 | 0.079001 | 0.081989 | 0.042811 | -0.020445 | 0.017707 | -0.010331 | -0.207489 | 0.049139 | 0.042543 | 1.000000 | -0.027605 | -0.067972 | -0.016382 | 0.027307 | 0.002911 | 0.019174 | 0.048152 | 0.102575 |
| Fatigue | 0.000561 | -0.060458 | -0.039909 | -0.023290 | 0.007026 | 0.006564 | -0.047655 | 0.052035 | -0.058925 | -0.043903 | -0.027605 | 1.000000 | 0.009356 | -0.068401 | -0.027383 | -0.031058 | -0.009562 | -0.025623 | -0.044188 |
| Gastrointestinal | -0.075390 | -0.008067 | 0.008251 | 0.025886 | -0.014673 | 0.101909 | -0.050333 | 0.097778 | 0.004121 | 0.040651 | -0.067972 | 0.009356 | 1.000000 | 0.099577 | 0.025277 | -0.017251 | -0.061885 | -0.027603 | -0.003367 |
| Abroad travel | 0.117795 | 0.128726 | 0.331418 | 0.205986 | 0.034526 | 0.068286 | -0.088854 | 0.043589 | -0.020761 | 0.039013 | -0.016382 | -0.068401 | 0.099577 | 1.000000 | 0.080210 | 0.113399 | 0.069609 | 0.143094 | 0.443875 |
| Contact with COVID Patient | 0.214634 | 0.164704 | 0.128330 | 0.189251 | 0.003776 | 0.005046 | -0.062482 | -0.082101 | -0.025593 | -0.085696 | 0.027307 | -0.027383 | 0.025277 | 0.080210 | 1.000000 | 0.234649 | 0.079800 | 0.006909 | 0.357122 |
| Attended Large Gathering | 0.200304 | 0.070490 | 0.117963 | 0.216438 | 0.061099 | -0.044592 | -0.020548 | -0.162992 | -0.045437 | -0.061650 | 0.002911 | -0.031058 | -0.017251 | 0.113399 | 0.234649 | 1.000000 | 0.083795 | 0.063776 | 0.390145 |
| Visited Public Exposed Places | 0.066688 | 0.002252 | 0.086176 | 0.079055 | 0.032568 | 0.020941 | -0.093049 | -0.005790 | 0.086169 | -0.078212 | 0.019174 | -0.009562 | -0.061885 | 0.069609 | 0.079800 | 0.083795 | 1.000000 | 0.028486 | 0.119755 |
| Family working in Public Exposed Places | 0.018295 | 0.012102 | 0.163102 | 0.104378 | -0.061323 | -0.115679 | 0.038343 | -0.012625 | 0.035000 | 0.097696 | 0.048152 | -0.025623 | -0.027603 | 0.143094 | 0.006909 | 0.063776 | 0.028486 | 1.000000 | 0.160208 |
| COVID-19 | 0.443764 | 0.352891 | 0.464292 | 0.502848 | -0.005657 | 0.089930 | -0.056837 | -0.027793 | 0.027072 | 0.040627 | 0.102575 | -0.044188 | -0.003367 | 0.443875 | 0.357122 | 0.390145 | 0.119755 | 0.160208 | 1.000000 |
# Heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(10,8))
ax = sns.heatmap(data.corr(), cmap="rocket")

# Remove the features that are only weakly correlated (|r| < 0.1) with the
# COVID-19 target. 'Fatigue ' is similarly weak but is kept, as in the
# original analysis. Note the trailing space in 'Gastrointestinal '.
weak_features = [
    'Running Nose',
    'Chronic Lung Disease',
    'Headache',
    'Heart Disease',
    'Diabetes',
    'Gastrointestinal ',
    'Asthma',
]
data = data.drop(columns=weak_features)

# Separate predictors from the target
x = data.drop('COVID-19', axis=1)
y = data['COVID-19']

# 80/20 split. A fixed random_state makes every score below reproducible
# between runs, and stratify=y keeps the imbalanced Yes/No ratio of the
# target identical in the train and test partitions.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=42, stratify=y
)
x.columns
Index(['Breathing Problem', 'Fever', 'Dry Cough', 'Sore throat',
'Hyper Tension', 'Fatigue ', 'Abroad travel',
'Contact with COVID Patient', 'Attended Large Gathering',
'Visited Public Exposed Places',
'Family working in Public Exposed Places'],
dtype='object')
type(x_train)
pandas.core.frame.DataFrame
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the training split
log_cls = LogisticRegression(random_state=10)
log_cls.fit(x_train, y_train)

# Evaluate on the held-out split: confusion matrix and F1 score
y_pred = log_cls.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(f"\nF1 Score :  {f1_score(y_test, y_pred)}")
[[194 18] [ 21 854]] F1 Score : 0.9776760160274756
log_cls.feature_names_in_
array(['Breathing Problem', 'Fever', 'Dry Cough', 'Sore throat',
'Hyper Tension', 'Fatigue ', 'Abroad travel',
'Contact with COVID Patient', 'Attended Large Gathering',
'Visited Public Exposed Places',
'Family working in Public Exposed Places'], dtype=object)
# Keep the test accuracy for later use and report train/test accuracy
acc_log_cls = log_cls.score(x_test, y_test)
print(f"Train Accuracy  {log_cls.score(x_train, y_train)}")
print(f"Test Accuracy  {acc_log_cls}")
Train Accuracy 0.9648033126293996 Test Accuracy 0.9641214351425943
# Map each feature name to the absolute value of its logistic-regression
# coefficient, then show the features from most to least influential.
# Pairing names and coefficients with zip avoids hard-coding the number
# of features.
dicta = {
    name: abs(weight)
    for name, weight in zip(log_cls.feature_names_in_, log_cls.coef_[0])
}
print("\nTop Features in order")
dict(sorted(dicta.items(), key=lambda item: item[1], reverse=True))
Top Features in order
{'Abroad travel': 5.372912765719051,
'Attended Large Gathering': 5.077488056601601,
'Fever': 2.6501414475857326,
'Sore throat': 2.6466919628879277,
'Dry Cough': 2.6243375328714,
'Breathing Problem': 2.496947254454527,
'Contact with COVID Patient': 1.625022489400047,
'Family working in Public Exposed Places': 1.0419210208361478,
'Fatigue ': 0.03643879257919243,
'Visited Public Exposed Places': 0.036238658197606737,
'Hyper Tension': 0.014991447317832734}
log_cls.coef_
array([[ 2.49694725, 2.65014145, 2.62433753, 2.64669196, -0.01499145,
-0.03643879, 5.37291277, 1.62502249, 5.07748806, 0.03623866,
1.04192102]])
## Abroad travel, Attended Large Gathering, Fever, Sore throat, Dry Cough and Breathing Problem have the largest absolute coefficients, i.e. they are the features the logistic regression model weighs most heavily
x.columns
Index(['Breathing Problem', 'Fever', 'Dry Cough', 'Sore throat',
'Hyper Tension', 'Fatigue ', 'Abroad travel',
'Contact with COVID Patient', 'Attended Large Gathering',
'Visited Public Exposed Places',
'Family working in Public Exposed Places'],
dtype='object')
## SVM
from sklearn.svm import SVC
svm_clf = SVC(kernel='linear')
# Fit on the training split only. Fitting on the full x, y would let the
# model see the test rows during training and inflate the test accuracy.
svm_clf.fit(x_train, y_train)
SVC(kernel='linear')
## Features are NOT independent ?
# Class predictions of the SVM for the held-out split
pred = svm_clf.predict(x_test)
# Accuracy on the training split and on the held-out split
for split_name, split_x, split_y in (("Train", x_train, y_train),
                                     ("Test", x_test, y_test)):
    print(f"{split_name} Accuracy ", svm_clf.score(split_x, split_y))
Train Accuracy 0.9682539682539683 Test Accuracy 0.9696412143514259
# Keep the SVM test accuracy for later use
acc_svm_clf = svm_clf.score(x_test, y_test)
# Map each feature name to the absolute value of its linear-SVM weight and
# show the features from most to least influential. zip pairs names with
# weights so the feature count is not hard-coded.
dicta = {
    name: abs(weight)
    for name, weight in zip(svm_clf.feature_names_in_, svm_clf.coef_[0])
}
print("\nTop Features in order")
dict(sorted(dicta.items(), key=lambda item: item[1], reverse=True))
Top Features in order
{'Attended Large Gathering': 3.998881391660475,
'Abroad travel': 2.000004449747624,
'Breathing Problem': 1.9996603622461926,
'Sore throat': 1.9995789626697462,
'Dry Cough': 1.9995320516416175,
'Fever': 1.9995181985347399,
'Family working in Public Exposed Places': 0.00020887689702986378,
'Contact with COVID Patient': 0.00014710696201447604,
'Visited Public Exposed Places': 0.00014356045880958845,
'Fatigue ': 6.209827473835361e-05,
'Hyper Tension': 5.723246616540223e-05}
from sklearn.ensemble import RandomForestClassifier

# Random forest with 1000 trees, fitted on the training split
randf_model = RandomForestClassifier(n_estimators=1000)
randf_model.fit(x_train, y_train)
# Test accuracy expressed as a percentage
test_fraction_correct = randf_model.score(x_test, y_test)
acc_randomforest = test_fraction_correct * 100
acc_randomforest
97.42410303587856
# Keep the test accuracy for later use and report train/test accuracy
acc_randf_model = randf_model.score(x_test, y_test)
print(f"Train Accuracy  {randf_model.score(x_train, y_train)}")
print(f"Test Accuracy  {acc_randf_model}")
Train Accuracy 0.9802162410858063 Test Accuracy 0.9742410303587856
# Map each feature name to its random-forest importance and show the
# features from most to least important. zip pairs names with importances
# so the feature count is not hard-coded.
dicta = {
    name: abs(importance)
    for name, importance in zip(randf_model.feature_names_in_,
                                randf_model.feature_importances_)
}
print("\nTop Features in order")
dict(sorted(dicta.items(), key=lambda item: item[1], reverse=True))
Top Features in order
{'Breathing Problem': 0.17249560340237732,
'Abroad travel': 0.16842914501113226,
'Sore throat': 0.16393784895499064,
'Dry Cough': 0.13963722944902596,
'Attended Large Gathering': 0.10549212034101212,
'Fever': 0.08097135086503054,
'Contact with COVID Patient': 0.07506301714015166,
'Family working in Public Exposed Places': 0.026882853602896196,
'Fatigue ': 0.023774465688318033,
'Visited Public Exposed Places': 0.022915999513112247,
'Hyper Tension': 0.02040036603195308}
# KNN
# Fit one classifier per neighbourhood size K = 1..9 and record the share
# of misclassified test samples for each (this can take a little while)
k_values = range(1, 10)
error_rate = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train, y_train)
    pred_i = knn.predict(x_test)
    error_rate.append(np.mean(pred_i != y_test))

# Error rate as a function of K
plt.figure(figsize=(10, 6))
plt.plot(
    k_values,
    error_rate,
    color='blue',
    linestyle='dashed',
    marker='o',
    markerfacecolor='red',
    markersize=10,
)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
Text(0, 0.5, 'Error Rate')
# Final KNN model; K = 6 was presumably picked from the error-rate curve
num_neighbor = 6
knn_clf = KNeighborsClassifier(n_neighbors=num_neighbor)
knn_clf.fit(x_train, y_train)
pred = knn_clf.predict(x_test)

# Report K, the confusion matrix and the per-class metrics on the test split
print(f"WITH K =  {num_neighbor}")
print('\n')
print(confusion_matrix(y_test, pred))
print('\n')
print(classification_report(y_test, pred))
WITH K = 6
[[205 7]
[ 19 856]]
precision recall f1-score support
0 0.92 0.97 0.94 212
1 0.99 0.98 0.99 875
accuracy 0.98 1087
macro avg 0.95 0.97 0.96 1087
weighted avg 0.98 0.98 0.98 1087
# Keep the test accuracy for later use and report train/test accuracy
acc_knn_clf = knn_clf.score(x_test, y_test)
print(f"Train Accuracy  {knn_clf.score(x_train, y_train)}")
print(f"Test Accuracy  {acc_knn_clf}")
Train Accuracy 0.9786059351276742 Test Accuracy 0.9760809567617296
from sklearn import tree

# Decision tree using the Gini impurity criterion, fitted on the training split
d_t = tree.DecisionTreeClassifier(criterion='gini')
d_t.fit(x_train, y_train)
y_pred = d_t.predict(x_test)
# Test accuracy expressed as a percentage
test_fraction_correct = d_t.score(x_test, y_test)
acc_decisiontree = test_fraction_correct * 100
acc_decisiontree
97.42410303587856
# Map each feature name to its decision-tree importance and show the
# features from most to least important. zip pairs names with importances
# so the feature count is not hard-coded.
dicta = {
    name: abs(importance)
    for name, importance in zip(d_t.feature_names_in_,
                                d_t.feature_importances_)
}
print("\nTop Features in order")
dict(sorted(dicta.items(), key=lambda item: item[1], reverse=True))
Top Features in order
{'Sore throat': 0.2727999178980045,
'Breathing Problem': 0.22542975014843122,
'Abroad travel': 0.21362696378121346,
'Dry Cough': 0.07933788753820542,
'Attended Large Gathering': 0.07166074650165857,
'Contact with COVID Patient': 0.0548523014046704,
'Fever': 0.03560931296377859,
'Fatigue ': 0.019134689111653308,
'Visited Public Exposed Places': 0.012846166366675555,
'Hyper Tension': 0.0075984049723293,
'Family working in Public Exposed Places': 0.007103859313379582}
# Keep the test accuracy for later use and report train/test accuracy
acc_d_t = d_t.score(x_test, y_test)
print(f"Train Accuracy  {d_t.score(x_train, y_train)}")
print(f"Test Accuracy  {acc_d_t}")
Train Accuracy 0.9802162410858063 Test Accuracy 0.9742410303587856
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes fitted on the training split
nb_model = GaussianNB()
nb_model.fit(x_train, y_train)
# Test accuracy expressed as a percentage
test_fraction_correct = nb_model.score(x_test, y_test)
acc_gaussian = test_fraction_correct * 100
acc_gaussian
75.71297148114076
# Keep the test accuracy for later use and report train/test accuracy
acc_nb_model = nb_model.score(x_test, y_test)
print(f"Train Accuracy  {nb_model.score(x_train, y_train)}")
print(f"Test Accuracy  {acc_nb_model}")
Train Accuracy 0.756383712905452 Test Accuracy 0.7571297148114076
# XGBoost (third-party package; install with `pip install xgboost` if missing)
from xgboost import XGBClassifier

# Gradient-boosted trees with the library's default settings,
# fitted on the training split
xgb_model = XGBClassifier()
xgb_model.fit(x_train, y_train)
# Class predictions for the held-out split
preds = xgb_model.predict(x_test)
C:\Users\skaks\miniconda3\lib\site-packages\xgboost\compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead. C:\Users\skaks\miniconda3\lib\site-packages\xgboost\sklearn.py:1224: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1]. C:\Users\skaks\miniconda3\lib\site-packages\xgboost\data.py:262: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
[16:16:55] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
# Accuracy of the XGBoost model on the training and held-out splits
for split_name, split_x, split_y in (("Train", x_train, y_train),
                                     ("Test", x_test, y_test)):
    print(f"{split_name} Accuracy ", xgb_model.score(split_x, split_y))
Train Accuracy 0.9802162410858063 Test Accuracy 0.9742410303587856
C:\Users\skaks\miniconda3\lib\site-packages\xgboost\data.py:262: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
acc_xgb_model = xgb_model.score(x_test, y_test)
C:\Users\skaks\miniconda3\lib\site-packages\xgboost\data.py:262: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
from xgboost import plot_importance

# Bar chart of the per-feature importance scores of the fitted XGBoost model
plot_importance(xgb_model)
<AxesSubplot:title={'center':'Feature importance'}, xlabel='F score', ylabel='Features'>
# AdaBoost Algorithm
from sklearn.ensemble import AdaBoostClassifier

# Library defaults: n_estimators = 50 and a decision tree as base estimator
adb_model = AdaBoostClassifier()
adb_model.fit(x_train, y_train)
# Class predictions for the held-out split
preds = adb_model.predict(x_test)

# Map each feature name to its AdaBoost importance and show the features
# from most to least important. zip pairs names with importances so the
# feature count is not hard-coded.
dicta = {
    name: abs(importance)
    for name, importance in zip(adb_model.feature_names_in_,
                                adb_model.feature_importances_)
}
print("\nTop Features in order")
dict(sorted(dicta.items(), key=lambda item: item[1], reverse=True))
Top Features in order
{'Attended Large Gathering': 0.26,
'Fever': 0.14,
'Dry Cough': 0.12,
'Sore throat': 0.12,
'Contact with COVID Patient': 0.1,
'Breathing Problem': 0.08,
'Hyper Tension': 0.06,
'Family working in Public Exposed Places': 0.06,
'Fatigue ': 0.02,
'Abroad travel': 0.02,
'Visited Public Exposed Places': 0.02}
# Accuracy of the AdaBoost model on the training and held-out splits
for split_name, split_x, split_y in (("Train", x_train, y_train),
                                     ("Test", x_test, y_test)):
    print(f"{split_name} Accuracy ", adb_model.score(split_x, split_y))
Train Accuracy 0.9627329192546584 Test Accuracy 0.9613615455381784
acc_adb_model=adb_model.score(x_test, y_test)
# Hand-entered summary table: one row per retained feature, with a count
# recorded for each. The two lists are positionally aligned (entry k of
# `top5_counts` belongs to entry k of `feature_names`).
feature_names = [
    'Breathing Problem',
    'Fever',
    'Dry Cough',
    'Sore throat',
    'Hyper Tension',
    'Fatigue ',
    'Abroad travel',
    'Contact with COVID Patient',
    'Attended Large Gathering',
    'Visited Public Exposed Places',
    'Family working in Public Exposed Places',
]
top5_counts = [5, 2, 7, 6, 1, 1, 5, 0, 5, 1, 0]

t_feat = {'Features': feature_names, 'Count': top5_counts}

# Tabulate as a DataFrame and display it
top_feat = pd.DataFrame(t_feat)
top_feat
| Features | Count | |
|---|---|---|
| 0 | Breathing Problem | 5 |
| 1 | Fever | 2 |
| 2 | Dry Cough | 7 |
| 3 | Sore throat | 6 |
| 4 | Hyper Tension | 1 |
| 5 | Fatigue | 1 |
| 6 | Abroad travel | 5 |
| 7 | Contact with COVID Patient | 0 |
| 8 | Attended Large Gathering | 5 |
| 9 | Visited Public Exposed Places | 1 |
| 10 | Family working in Public Exposed Places | 0 |
fig, ax = plt.subplots(figsize=(10, 8))
# Vertical bar chart of the counts, ordered from smallest to largest
ordered_feat = top_feat.sort_values(by='Count')
ordered_feat.plot.bar(x='Features', y='Count', ax=ax, color="purple")
ax.set_title("Count of features apprearing in the top 5 most important features from all the trained models ")
plt.show()
## ROC curve and AUC (a larger area means better separation of the classes)
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot

# "No skill" baseline: the same score of 0 for every test sample
ns_probs = [0] * len(y_test)

# Fit a fresh logistic-regression model on the training split
model = LogisticRegression(solver='lbfgs')
model.fit(x_train, y_train)

# Predicted probability of the positive class (column 1) per test sample
lr_probs = model.predict_proba(x_test)[:, 1]

# Area under the ROC curve for the baseline and for the model
ns_auc = roc_auc_score(y_test, ns_probs)
lr_auc = roc_auc_score(y_test, lr_probs)
print(f'No Skill: ROC AUC={ns_auc:.3f}')
print(f'Logistic: ROC AUC={lr_auc:.3f}')

# ROC curve points (thresholds are not needed)
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)

# Draw both curves with labelled axes and a legend
pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
pyplot.plot(lr_fpr, lr_tpr, marker='.', label='Logistic')
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
pyplot.show()
No Skill: ROC AUC=0.500 Logistic: ROC AUC=0.993
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from matplotlib import pyplot
from sklearn.metrics import precision_recall_curve

# Fit a fresh logistic-regression model on the training split
model = LogisticRegression()
model.fit(x_train, y_train)

# Predicted probability of the positive class (column 1) per test sample
lr_probs = model.predict_proba(x_test)[:, 1]
# Hard class predictions for the same samples
yhat = model.predict(x_test)

# Precision/recall pairs over all thresholds (thresholds are not needed)
lr_precision, lr_recall, _ = precision_recall_curve(y_test, lr_probs)

# F1 of the hard predictions and area under the precision-recall curve
lr_f1 = f1_score(y_test, yhat)
lr_auc = auc(lr_recall, lr_precision)
print(f'Logistic: f1={lr_f1:.3f} auc={lr_auc:.3f}')

# Baseline precision: the share of positive samples in the test split
no_skill = (y_test == 1).sum() / len(y_test)

# Draw the baseline and the model curve with labelled axes and a legend
pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
pyplot.plot(lr_recall, lr_precision, marker='.', label='Logistic')
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
pyplot.legend()
pyplot.show()
Logistic: f1=0.978 auc=0.998
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with three hidden layers of 8 units each,
# fitted on the training split (fit returns the estimator itself).
mlp = MLPClassifier(
    hidden_layer_sizes=(8, 8, 8),
    activation='relu',
    solver='adam',
    max_iter=500,
).fit(x_train, y_train)

# Predictions for both splits; only the training predictions are reported.
predict_train, predict_test = mlp.predict(x_train), mlp.predict(x_test)

print(confusion_matrix(y_train, predict_train))
print(classification_report(y_train, predict_train))
[[ 799 40]
[ 54 3454]]
precision recall f1-score support
0 0.94 0.95 0.94 839
1 0.99 0.98 0.99 3508
accuracy 0.98 4347
macro avg 0.96 0.97 0.97 4347
weighted avg 0.98 0.98 0.98 4347
# Report the MLP's score() on the x_train/y_train and x_test/y_test splits.
print("Train Accuracy ",mlp.score(x_train, y_train))
print("Test Accuracy ",mlp.score(x_test, y_test))
Train Accuracy 0.9783758914193696 Test Accuracy 0.9760809567617296
# Keep the MLP's test-split score; it is read later when the `models`
# comparison table is built.
acc_mlp = mlp.score(x_test, y_test)
# Predict a label for every row of x using the `knn` model defined earlier
# in the notebook; the bare expression shows the resulting array.
Knn_pred = knn.predict(x)
Knn_pred
array([1, 1, 1, ..., 0, 0, 0])
# Append the KNN predictions to x as an extra feature column.
# NOTE(review): this mutates x in place, so a later re-run of
# `knn.predict(x)` would be handed the extra column as well — confirm the
# cells are only ever executed in order.
x['KNN_Output'] = Knn_pred
x.head()
| Breathing Problem | Fever | Dry Cough | Sore throat | Hyper Tension | Fatigue | Abroad travel | Contact with COVID Patient | Attended Large Gathering | Visited Public Exposed Places | Family working in Public Exposed Places | KNN_Output | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 1 | 1 | 1 |
| 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 |
| 2 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 |
| 3 | 1 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 1 |
| 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
# Split x (which now includes the 'KNN_Output' column) into new train/test
# sets. No random_state is passed, so the rows are shuffled afresh and do
# not line up with any earlier split: only y_train1/y_test1 match these rows.
x_train1, x_test1, y_train1, y_test1 = train_test_split(x, y, test_size = 0.20)
mlp_knn = MLPClassifier(hidden_layer_sizes=(8,8,8), activation='relu', solver='adam', max_iter=500)
mlp_knn.fit(x_train1,y_train1)
predict_train = mlp_knn.predict(x_train1)
predict_test = mlp_knn.predict(x_test1)
# Evaluate against y_train1, the labels of the rows that produced
# predict_train. Comparing with y_train from the earlier split pairs
# predictions with unrelated rows and yields a meaningless report.
print(confusion_matrix(y_train1,predict_train))
print(classification_report(y_train1,predict_train))
[[ 157 682]
[ 688 2820]]
precision recall f1-score support
0 0.19 0.19 0.19 839
1 0.81 0.80 0.80 3508
accuracy 0.68 4347
macro avg 0.50 0.50 0.50 4347
weighted avg 0.69 0.68 0.69 4347
# Report mlp_knn's score() on the x_train1/y_train1 and x_test1/y_test1
# splits.
print("Train Accuracy ",mlp_knn.score(x_train1, y_train1))
print("Test Accuracy ",mlp_knn.score(x_test1, y_test1))
Train Accuracy 0.9774557165861514 Test Accuracy 0.9760809567617296
# Keep mlp_knn's test-split score; it is read later when the `models`
# comparison table is built.
acc_mlp_knn = mlp_knn.score(x_test1, y_test1)
# Pair each model's display name with its stored test score so the two
# columns of the comparison table cannot drift out of alignment.
model_scores = [
    ('Support Vector Machines', acc_svm_clf),
    ('KNN', acc_knn_clf),
    ('Logistic Regression', acc_log_cls),
    ('Random Forest', acc_randf_model),
    ('Naive Bayes', acc_nb_model),
    ('Decision Tree', acc_d_t),
    ('XGBoost', acc_xgb_model),
    ('AdaBoost', acc_adb_model),
    ('MLP Classifier', acc_mlp),
    ('MLP with KNN as Feature', acc_mlp_knn),
]
models = pd.DataFrame({
    'Model': [name for name, _score in model_scores],
    'Score': [score for _name, score in model_scores],
})
# Show the models ranked from best to worst score.
models.sort_values(by='Score', ascending=False)
| Model | Score | |
|---|---|---|
| 1 | KNN | 0.976081 |
| 8 | MLP Classifier | 0.976081 |
| 9 | MLP with KNN as Feature | 0.976081 |
| 3 | Random Forest | 0.974241 |
| 5 | Decision Tree | 0.974241 |
| 6 | XGBoost | 0.974241 |
| 0 | Support Vector Machines | 0.969641 |
| 2 | Logistic Regression | 0.964121 |
| 7 | AdaBoost | 0.961362 |
| 4 | Naive Bayes | 0.757130 |